#################################################################################
# Replication file for:                                                         #
# "Balancing Precision and Retention in Experimental Design"                    #
#                                                                               #
# Gustavo Diaz                                                                  #
# Northwestern University                                                       #
# gustavo.diaz@northwestern.edu                                                 #
#                                                                               #
# Erin L. Rossiter                                                              #
# University of Notre Dame                                                      #
# erossite@nd.edu                                                               #
#                                                                               #
# This file cleans the Qualtrics survey data from the three replication         #
# experiments, produces statistics reported in Appendix E.2, and saves          #
# the cleaned data.                                                             #
#################################################################################

# Packages ----
# The pipe and the data verbs used below (filter, mutate, rename, coalesce,
# case_when, across) come from dplyr. Attach it here so the script also runs
# in a fresh R session; this is a no-op if dplyr is already attached.
# scales is only ever called with an explicit `scales::` prefix.
library(dplyr)

# Data ----
# Raw survey data, one file per replicated study:
# df_dh = Dietrich & Hayes, df_bg = Bayram & Graham, df_th = Tappin & Hewitt
df_dh <- readRDS("./data/raw_data/DietrichHayesReplication.rds")
df_bg <- readRDS("./data/raw_data/BayramGrahamReplication.rds")
df_th <- readRDS("./data/raw_data/TappinHewittReplication.rds")

# Apply exclusion criteria -----

#  Appendix E.2 text

# Starting sample sizes (before any exclusion)
nrow(df_dh)
nrow(df_bg)
nrow(df_th)

# Number who failed the attention check
# TRUE = failed: answered something other than 12, or gave no answer (NA)
table(with(df_dh, attention_check != 12 | is.na(attention_check)))
table(with(df_bg, attention_check != 12 | is.na(attention_check)))
table(with(df_th, attention_check != 12 | is.na(attention_check)))

## Exclude those who failed the attention check
# filter() keeps only rows where the condition is TRUE, so respondents with a
# missing answer are dropped together with those who answered incorrectly
df_dh <- filter(df_dh, attention_check == 12)
df_bg <- filter(df_bg, attention_check == 12)
df_th <- filter(df_th, attention_check == 12)

# Dietrich & Hayes: exclude people who did not indicate they were
# Black or African American (value of 2).
# Implemented as a deny-list of the other race codes; the table shows
# TRUE = kept, FALSE = excluded.
# NOTE(review): because this is a deny-list, rows with a missing race or with
# any code not listed here are retained -- confirm that is intended.
table(with(df_dh, !(race %in% c(1, 3, 4, 6, 8, 16)))) # 1.76%
df_dh <- filter(df_dh, !(race %in% c(1, 3, 4, 6, 8, 16)))

# Tappin & Hewitt: exclude people who do not have a partisan
# affiliation or leaning, i.e. pid is 3 or 4 and pid_lean is 3.
# Article states: "(We also analyze the data excluding Independents
# in the Appendix; the results are the same)." (pg. 54)
# As above, TRUE = kept, FALSE = excluded.
table(with(df_th, !(pid %in% c(3, 4) & pid_lean == 3))) # 10.58%
df_th <- filter(df_th, !(pid %in% c(3, 4) & pid_lean == 3))

# Sample sizes after all exclusions
nrow(df_dh)
nrow(df_bg)
nrow(df_th)


# Clean DH -----

df_dh <- df_dh %>%
  # fix typo in column name
  rename(cr_sym_whiteleg = cr_sym_whitekleg) %>%
  # coalesce() is already vectorised, so no rowwise()/ungroup() wrapper is
  # needed around it. That wrapper always handed back a plain tibble, so
  # convert explicitly to keep the same class for downstream code.
  as_tibble() %>%
  mutate(
    # DV into one column: first non-missing value across the eight
    # condition-specific approval items
    speech_approve = coalesce(cr_nonsym_blackleg, cr_sym_blackleg,
                              cr_nonsym_whiteleg, cr_sym_whiteleg,
                              en_nonsym_blackleg, en_sym_blackleg,
                              en_nonsym_whiteleg, en_sym_whiteleg),
    # scale DV like in original study
    # reverse-code so larger numbers = more approval, rescale to [0,1]
    # (maps a raw 1 to 1 and a raw 5 to 0; presumably a 1-5 item -- verify
    # against the questionnaire)
    speech_approve_scaled = 1 - ((speech_approve - 1) / 4),
    # clean quasi-DV same way
    dv_quasipre_scaled = 1 - ((dv_quasipre - 1) / 4),
    # treatment variable for comparison of interest
    # called Z to be comparable across studies:
    # 1 = symbolic, 0 = non-symbolic, both within civilrights == 1;
    # every other row is NA
    Z = case_when(
      symbolic == 1 & civilrights == 1 ~ 1,
      symbolic == 0 & civilrights == 1 ~ 0,
      .default = NA
    )
  )


# Clean BG -----

df_bg <- df_bg %>%
  # coalesce() is already vectorised, so no rowwise()/ungroup() wrapper is
  # needed around it. That wrapper always handed back a plain tibble, so
  # convert explicitly to keep the same class for downstream code.
  as_tibble() %>%
  mutate(
    # DV into one column: first non-missing value across the five
    # post-treatment items
    delpref_post = coalesce(delpref_post1, delpref_post2,
                            delpref_post3, delpref_post4,
                            delpref_post5),
    # Clean DV to 0 (US give directly), 1 (US give through IO);
    # any other value, including missing, becomes NA
    delpref_post = case_when(
      delpref_post == 1 ~ 0,
      delpref_post == 2 ~ 1,
      .default = NA
    ),
    # Clean pre-treatment measure to match
    delpref_pre = case_when(
      delpref_pre == 1 ~ 0,
      delpref_pre == 2 ~ 1,
      .default = NA
    ),
    # copy and rename full treatment assignment variable
    # (must happen before Z is overwritten below)
    treatment = Z,
    # treatment variable for comparison of interest
    # called Z to be comparable across studies;
    # arms other than "control" and "ear_cost" become NA
    Z = case_when(
      treatment == "control" ~ 0,
      treatment == "ear_cost" ~ 1,
      .default = NA
    )
  )
  


# Clean TH -----

# "recode policy opinions such that higher values indicate
# greater agreement with the in-party cue" (pg 54)
df_th <- df_th %>%
  # Trump disagrees. Rs need to be recoded so agreement with party cue
  # (e.g., 1, 2...) has larger values
  # Obama agrees. Ds do not need to be recoded. Agreement with party cue
  # is already agreement with the question
  # (%in% rather than == so a missing pid / pid_lean counts as "no match"
  # and the response is left as is)
  mutate(across(c(salestax_pre, salestax_post, salestax_followup,
                  pension_pre, pension_post, pension_followup,
                  foreignaid_pre, foreignaid_post, foreignaid_followup,
                  healthcare_pre, healthcare_post, healthcare_followup),
                ~ ifelse(pid %in% 2 | pid_lean %in% 1, 8 - ., .))) %>%
  # Obama disagrees. Ds need to be recoded so agreement with party cue
  # (e.g., 1, 2, ...) has larger values
  mutate(across(c(fedaudit_pre, fedaudit_post, fedaudit_followup),
                ~ ifelse(pid %in% 1 | pid_lean %in% 2, 8 - ., .))) %>%
  # make outcomes between 0 and 1 for comparability with other studies.
  # Each new column is named <item>_<wave>_scaled; generating the names with
  # `.names` keeps them consistent across all 15 outcomes.
  # NOTE(review): scales::rescale() without `from =` maps the observed min/max
  # of each column to 0/1 -- confirm every item spans its full response scale
  # in the sample, or pass `from =` explicitly.
  mutate(across(
    c(pension_pre, pension_post, pension_followup,
      salestax_pre, salestax_post, salestax_followup,
      foreignaid_pre, foreignaid_post, foreignaid_followup,
      healthcare_pre, healthcare_post, healthcare_followup,
      fedaudit_pre, fedaudit_post, fedaudit_followup),
    ~ scales::rescale(., to = c(0, 1)),
    .names = "{.col}_scaled"
  )) %>%
  # Earlier versions of this script misspelled healthcare_post_scaled as
  # "healthcaren_post_scaled". Keep the old name as an alias so code written
  # against the previous cleaned file keeps working; drop it once nothing
  # refers to it.
  mutate(healthcaren_post_scaled = healthcare_post_scaled,
         .after = healthcare_post_scaled) %>%
  # Treatment variable for comparison of interest
  # called Z to be comparable across studies
  mutate(Z = cue_pension, .after = cue_pension)

# Save -----

# saveRDS() does not create missing folders, so make sure the output
# directory exists first (silently does nothing if it is already there)
dir.create("data/processed_data", recursive = TRUE, showWarnings = FALSE)

# One cleaned file per study, same base name as the raw file plus "-clean"
saveRDS(df_dh, "data/processed_data/DietrichHayesReplication-clean.rds")
saveRDS(df_bg, "data/processed_data/BayramGrahamReplication-clean.rds")
saveRDS(df_th, "data/processed_data/TappinHewittReplication-clean.rds")
